{
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "## 4.2.4 多分类到二分类的实现"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "outputs": [],
   "source": [
    "import numpy as np"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-06T15:16:11.161828800Z",
     "start_time": "2023-05-06T15:16:11.061828700Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "outputs": [],
   "source": [
    "class Embedding:\n",
    "    def __init__(self, W):\n",
    "        self.params = [W]\n",
    "        self.grads = [np.zeros_like(W)]\n",
    "        self.idx = None\n",
    "\n",
    "    def forward(self, idx):\n",
    "        W, = self.params\n",
    "        self.idx = idx\n",
    "        out = W[idx]\n",
    "        return out\n",
    "\n",
    "    def backward(self, dout):\n",
    "        dW, = self.grads\n",
    "        dW[...] = 0\n",
    "\n",
    "        for i, word_id in enumerate(self.idx):\n",
    "            dW[word_id] += dout[i]\n",
    "        # 或者\n",
    "        # np.add.at(dW, self.idx, dout)\n",
    "        return None"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-06T15:16:11.773559300Z",
     "start_time": "2023-05-06T15:16:11.764562100Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true,
    "ExecuteTime": {
     "end_time": "2023-05-06T15:36:37.038607800Z",
     "start_time": "2023-05-06T15:36:37.028613500Z"
    }
   },
   "outputs": [],
   "source": [
    "class EmbeddingDot:\n",
    "    def __init__(self, W):\n",
    "        self.embed = Embedding(W)\n",
    "        self.params = self.embed.params\n",
    "        self.gards = self.embed.grads\n",
    "        self.cache = None\n",
    "\n",
    "    def forward(self, h, idx):\n",
    "        target_W = self.embed.forward(idx)\n",
    "        out = np.sum(target_W * h, axis=1)\n",
    "\n",
    "        self.cache = (h, target_W)\n",
    "        return out\n",
    "\n",
    "    def backward(self, dout):\n",
    "        h, target_W = self.cache\n",
    "        dout = dout.reshape(dout.shape[0], 1)\n",
    "\n",
    "        dtarget_W = dout * h\n",
    "        self.embed.backward(dtarget_W)\n",
    "        dh = dout * target_W\n",
    "        return dh"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "EmbeddingDot 类共有 4 个成员变量：embed、params、grads 和 cache。根据本书的代码规范，params 保存参数，grads 保存梯度。另外，作为缓存，embed 保存 Embedding 层，cache 保存正向传播时的计算结果\n",
    "\n",
    "正向传播的 forward(h, idx) 方法的参数接收中间层的神经元（h）和单词 ID 的 NumPy 数组（idx）。这里，idx 是单词 ID 列表，这是因为我们假定了对数据进行 mini-batch 处理\n",
    "\"\"\""
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "## 4.2.6 负采样的采样方法"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "outputs": [
    {
     "data": {
      "text/plain": "5"
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 从0到9的数字中随机选择一个数字\n",
    "np.random.choice(10)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-06T15:57:34.616376500Z",
     "start_time": "2023-05-06T15:57:34.582377400Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "outputs": [
    {
     "data": {
      "text/plain": "1"
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.random.choice(10)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-06T15:57:43.722669100Z",
     "start_time": "2023-05-06T15:57:43.667039300Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "outputs": [],
   "source": [
    "# 从words列表中随机选择一个元素\n",
    "words = ['you', 'say', 'goodbye', 'I', 'hello', '.']"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-06T15:57:51.304160200Z",
     "start_time": "2023-05-06T15:57:51.293160300Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "outputs": [
    {
     "data": {
      "text/plain": "'.'"
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.random.choice(words)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-06T15:57:56.579051700Z",
     "start_time": "2023-05-06T15:57:56.533985500Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "outputs": [
    {
     "data": {
      "text/plain": "array(['.', 'you', 'you', 'goodbye', 'say'], dtype='<U7')"
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 有放回采样5次\n",
    "np.random.choice(words, size=5)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-06T15:58:16.699194600Z",
     "start_time": "2023-05-06T15:58:16.683195800Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "outputs": [
    {
     "data": {
      "text/plain": "array(['goodbye', 'hello', '.', 'I', 'say'], dtype='<U7')"
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 无放回采样5次\n",
    "np.random.choice(words, size=5, replace=False)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-06T15:58:28.407091100Z",
     "start_time": "2023-05-06T15:58:28.392092300Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "outputs": [
    {
     "data": {
      "text/plain": "'I'"
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 基于概率分布进行采样\n",
    "p = [0.5, 0.1, 0.05, 0.2, 0.05, 0.1]\n",
    "np.random.choice(words, p=p)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-06T15:58:51.504162Z",
     "start_time": "2023-05-06T15:58:51.476087400Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "如上所示，np.random.choice() 可以用于随机抽样。如果指定 size 参数，将执行多次采样。如果指定 replace=False，将进行无放回采样。通过给参数 p 指定表示概率分布的列表，将进行基于概率分布的采样。剩下的就是使用这个函数抽取负例\n",
    "\"\"\""
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "word2vec 中提出的负采样对刚才的概率分布增加了一个步骤，对原来的概率分布取 0.75 次方\n",
    "通过取 0.75 次方，低频单词的概率将稍微变高\n",
    "\"\"\""
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "outputs": [],
   "source": [
    "p = [0.7, 0.29, 0.01]"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-06T16:00:30.412753500Z",
     "start_time": "2023-05-06T16:00:30.399752900Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "outputs": [],
   "source": [
    "new_p = np.power(p, 0.75)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-06T16:00:42.750320500Z",
     "start_time": "2023-05-06T16:00:42.723849200Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "outputs": [],
   "source": [
    "new_p /= np.sum(new_p)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-06T16:00:50.592750700Z",
     "start_time": "2023-05-06T16:00:50.587749400Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0.64196878 0.33150408 0.02652714]\n"
     ]
    }
   ],
   "source": [
    "print(new_p)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-06T16:00:55.451801800Z",
     "start_time": "2023-05-06T16:00:55.446802800Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "outputs": [],
   "source": [
    "from negative_sampling_layer import UnigramSampler"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-06T16:11:33.759408100Z",
     "start_time": "2023-05-06T16:11:33.744407300Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "在进行初始化时，UnigramSampler 类取 3 个参数，分别是单词 ID 列表格式的 corpus、对概率分布取的次方值 power（默认值是 0.75）和负例的采样个数 sample_size。UnigramSampler 类有 get_negative_sample(target) 方法，该方法以参数 target 指定的单词 ID 为正例，对其他的单词 ID 进行采样。\n",
    "\"\"\""
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "outputs": [],
   "source": [
    "corpus = np.array([0, 1, 2, 3, 4, 1, 2, 3])\n",
    "power = 0.75\n",
    "sample_size = 2"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-06T16:11:58.998140400Z",
     "start_time": "2023-05-06T16:11:58.993141600Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "outputs": [],
   "source": [
    "sampler = UnigramSampler(corpus, power, sample_size)\n",
    "target = np.array([1, 3, 0])\n",
    "negative_sample = sampler.get_negative_sample(target)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-06T16:13:07.898182100Z",
     "start_time": "2023-05-06T16:13:07.887181500Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[4 2]\n",
      " [1 4]\n",
      " [1 3]]\n"
     ]
    }
   ],
   "source": [
    "print(negative_sample)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-06T16:13:13.821492500Z",
     "start_time": "2023-05-06T16:13:13.810491200Z"
    }
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "## 4.2.7  负采样的实现"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "outputs": [],
   "source": [
    "from common.layers import Embedding, SigmoidWithLoss"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-06T16:19:28.757190200Z",
     "start_time": "2023-05-06T16:19:28.739191700Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "outputs": [],
   "source": [
    "class NegativeSamplingLoss:\n",
    "    def __init__(self, W, corpus, power=0.75, simple_size=5):\n",
    "        self.simple_size = simple_size\n",
    "        self.sampler = UnigramSampler(corpus, power, sample_size)\n",
    "        self.loss_layers = [SigmoidWithLoss() for _ in range(sample_size + 1)]\n",
    "        self.embed_dot_layers = [EmbeddingDot(W) for _ in range(simple_size + 1)]\n",
    "        self.params, self.grads = [], []\n",
    "        for layer in self.embed_dot_layers:\n",
    "            self.params += layer.params\n",
    "            self.grads += layer.gards\n",
    "\n",
    "    def forward(self, h, target):\n",
    "     batch_size = target.shape[0]\n",
    "     negative_sample = self.sampler.get_negative_sample(target)\n",
    "\n",
    "     # 正例的正向传播\n",
    "     score = self.embed_dot_layers[0].forward(h, target)\n",
    "     correct_label = np.ones(batch_size, dtype=np.int32)\n",
    "     loss = self.loss_layers[0].forward(score, correct_label)\n",
    "\n",
    "     # 负例的正向传播\n",
    "     negative_label = np.zeros(batch_size, dtype=np.int32)\n",
    "     for i in range(self.sample_size):\n",
    "         negative_target = negative_sample[:, i]\n",
    "         score = self.embed_dot_layers[1 + i].forward(h, negative_target)\n",
    "         loss += self.loss_layers[1 + i].forward(score, negative_label)\n",
    "\n",
    "     return loss\n",
    "\n",
    "    def backward(self, dout=1):\n",
    "         dh = 0\n",
    "         for l0, l1 in zip(self.loss_layers, self.embed_dot_layers):\n",
    "             dscore = l0.backward(dout)\n",
    "             dh += l1.backward(dscore)\n",
    "         return dh"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-06T16:28:56.952054700Z",
     "start_time": "2023-05-06T16:28:56.939055Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
